Data building script for grouping GD and PD estimates based on

Data description

Loci type in GD data
Var1 Freq
FCA 0.9290657
LEO 0.0034602
PAT 0.0017301
PBE 0.0034602
PCO 0.0294118
PTI 0.0034602
PUN 0.0294118

There are 135 population genetic studies and 314 population density studies in the raw datasets.

Number of distinct populations for each measure
sp Popden Popgen
Acinonyx jubatus 5 16
Caracal aurata 5 NA
Caracal caracal NA 1
Felis bieti NA 1
Felis chaus 1 1
Felis lybica NA 2
Felis silvestris 8 29
Herpailurus yagouaroundi NA 1
Leopardus colocolo 3 12
Leopardus geoffroyi 4 4
Leopardus guigna NA 10
Leopardus guttulus NA 4
Leopardus jacobita 2 5
Leopardus pardalis 30 7
Leopardus tigrinus 1 1
Leopardus wiedii 5 NA
Leptailurus serval 4 NA
Lynx canadensis 1 56
Lynx lynx 14 19
Lynx pardinus 4 3
Lynx rufus 5 60
Neofelis diardi 11 NA
Neofelis nebulosa 3 NA
Otocolobus manul 1 NA
Panthera leo 35 65
Panthera onca 53 43
Panthera pardus 93 24
Panthera tigris 64 31
Panthera uncia 13 17
Pardofelis marmorata 4 NA
Prionailurus bengalensis 10 1
Prionailurus rubiginosus 1 NA
Prionailurus viverrinus 4 NA
Puma concolor 18 108
Number of distinct countries in each species
sp Popden Popgen
Acinonyx jubatus 3 6
Caracal aurata 1 NA
Caracal caracal NA 1
Felis bieti NA 1
Felis chaus 1 1
Felis lybica NA 1
Felis silvestris 5 9
Herpailurus yagouaroundi NA 1
Leopardus colocolo 2 4
Leopardus geoffroyi 3 3
Leopardus guigna NA 2
Leopardus guttulus NA 1
Leopardus jacobita 2 3
Leopardus pardalis 8 3
Leopardus tigrinus 1 1
Leopardus wiedii 2 NA
Leptailurus serval 3 NA
Lynx canadensis 1 2
Lynx lynx 6 14
Lynx pardinus 2 1
Lynx rufus 2 2
Neofelis diardi 2 NA
Neofelis nebulosa 3 NA
Otocolobus manul 1 NA
Panthera leo 16 15
Panthera onca 12 10
Panthera pardus 21 8
Panthera tigris 11 9
Panthera uncia 7 8
Pardofelis marmorata 4 NA
Prionailurus bengalensis 5 1
Prionailurus rubiginosus 1 NA
Prionailurus viverrinus 3 NA
Puma concolor 6 6

Applying K-means clustering in each species

Example : tigers

j = "Panthera tigris"
## Subset to sp data -----------------------------------------------------------------------
sp_name <- sort(unique(gp_df$sp))
sp_df.i <- droplevels(gp_df[gp_df$sp %in% j,])
  1. Verify each row has long/lat. Remove duplicated coordinates as we need one point per localisation for clustering.
all(complete.cases(sp_df.i[,c("long","lat")]))

dups <- duplicated(sp_df.i[,c("long","lat","source")])
if (any(dups)) {
  print(paste(length((dups[dups])), "Dups detected"))
  sp_df <- sp_df.i[!dups,]
  sp_df.ii <- sp_df.i[dups,]
  } else sp_df <- sp_df.i
# - initialise column
  sp_df$cluster_den <- NA
  sp_df$cluster_gen <- NA
  sp_df$closest_ID <- NA
  sp_df$distance <- NA
  sp_df$cluster_center <- NA
  1. Overlaid biogeographic regions and apply clustering within each biogeo
# Biogeographical realms from https://www.sciencebase.gov/catalog/item/508fece8e4b0a1b43c29ca22
# biogeo <- readOGR("www_ter_ecos.shp")
points <- sp::SpatialPoints(coords=sp_df[,c("long","lat")], proj4string = sp::CRS("+proj=longlat +datum=WGS84 +no_defs"))
biogeo.point <- raster::extract(x=biogeo[,c("REALM")], y=points)

sp_df <- cbind(sp_df, biogeo.point[,c("REALM")])
nbiogeo <- table(sp_df$REALM)
nbiogeo

kk <- tapply(sp_df$source, sp_df$REALM, n_distinct)
table(sp_df$REALM, sp_df$source)
# Which biogeo has both GD and PD
sp_df <- subset(sp_df, REALM %in% names(which(kk == 2)))
sp_df$REALM <- droplevels(sp_df$REALM)
#subset to biogeo
for (k in names(kk)){
  bg_i <- which(sp_df$REALM == k)

# Check number of pop in each dataset
foo <- table(sp_df$source[bg_i])
print(k)
print(foo)
  1. Apply K-means clustering using GD and PD as initial centroids
if(length(foo) == 0) next
# using Popden coordinates as centers
cls_df <- sp_df[bg_i,]
den_cls <- kmeans(cls_df[,c("long","lat")], centers=cls_df[which(cls_df$source == "Popden"),c("long","lat")], trace=TRUE)

cls_df[,"cluster_den"] <- den_cls$cluster
cls_df$cluster_den <- paste0(cls_df$cluster_den,"_", cls_df$REALM)

# using Popgen coordinates as centers
gen_cls <- kmeans(cls_df[,c("long","lat")], centers=cls_df[which(cls_df$source == "Popgen"),c("long","lat")], trace=TRUE)
cls_df[,"cluster_gen"] <- gen_cls$cluster
cls_df$cluster_gen <- paste0(cls_df$cluster_gen,"_",cls_df$REALM)

sp_df[bg_i, "cluster_den"] <- cls_df$cluster_den
sp_df[bg_i, "cluster_gen"] <- cls_df$cluster_gen
  1. Join unassigned data points to the nearest unassigned point
## based on gen
bg_df <- sp_df[bg_i, ]
# - calculate distances between the nearest points -Popgen <-> Popden, (in meters)
xx <- bg_df[which(bg_df$source == "Popgen"),c("long","lat")]
yy <- bg_df[which(bg_df$source == "Popden"),c("long","lat")]
mat_gen2den <- geosphere::distm(xx, 
                                yy)
# get distance
bg_df[which(bg_df$source == "Popgen"),"distance"] <- apply(mat_gen2den, 1, min)
bg_df[which(bg_df$source == "Popden"),"distance"] <- apply(mat_gen2den, 2, min)

# get id of the closest and store it in bg_df$closest_ID
bg_df$closest_ID[which(bg_df$source == "Popden")] <- bg_df$ID[which(bg_df$source == "Popgen")][apply(mat_gen2den, 2, which.min)]
bg_df$closest_ID[which(bg_df$source == "Popgen")] <- bg_df$ID[which(bg_df$source == "Popden")][apply(mat_gen2den, 1, which.min)]

# - assign to the nearest cluster (not given by kmeans)
# sort df in ascending order of the distance
bg_df <- bg_df[order(bg_df$distance),]
# by row operation
# modify clus_den
for (i in 1:nrow(bg_df)){
    x <- bg_df[i, "ID"]
      # refresh cluster info
    b <- xtabs(~cluster_den+source, data=bg_df)
    nclus.d <- which(rowSums(b > 0) == 2)
    
    clos_id <- bg_df[bg_df$ID == x,"closest_ID"]
        if(!bg_df$cluster_den[bg_df$ID == clos_id] %in% names(nclus.d)){
      
      bg_df$cluster_den[bg_df$ID == x] <- bg_df[bg_df$ID == clos_id,"cluster_den"]
    }
}
# modify clus_gen
for (i in 1:nrow(bg_df)){
    x <- bg_df[i, "ID"]
      # refresh cluster info
    b <- xtabs(~cluster_gen+source, data=bg_df)
    nclus.g <- which(rowSums(b > 0) == 2)
    clos_id <- bg_df[bg_df$ID == x,"closest_ID"]
        if(!bg_df$cluster_gen[bg_df$ID == clos_id] %in% names(nclus.g)){
          bg_df$cluster_gen[bg_df$ID == x] <- bg_df[bg_df$ID == clos_id,"cluster_gen"]
    }
}

sp_df[bg_i,] <- bg_df
}
if(any(dups)){
  poo <- merge(sp_df.ii[,c("long","lat","source")], 
               sp_df[,c("long","lat","source","cluster_den","cluster_gen","cluster_center", "closest_ID","distance")], all.x=T, all.y=F)
  poop <- merge(sp_df.ii, unique(poo))
  sp_df <- rbind(sp_df, poop)
}
  1. Check if joined data are within species’ dispersal distance
# Sigma data -----------------------------------------------------------
sigma_df <- read.csv("Data/Sigma_data.csv")

Sigma data details:

sp_df <- merge(sp_df, sigma_df, by="sp", all.x=T)
sp_df$distance.lowerthanDI <- sp_df$distance <= sp_df$DI*1e3
  1. Merge GD and PD data based on clusters
gendata_sp <- subset(gen, sp %in% j)
dendata_sp <- subset(den, sp %in% j)
clus_sp <- subset(sp_df, sp %in% j)
clus_sp <- clus_sp[which(sp_df$distance.lowerthanDI == TRUE),]
    
    # Separate data based on source
    clus_sp_gen <- clus_sp[which(clus_sp$source == "Popgen"),c("country","long","lat","cluster_center","distance.lowerthanDI")]
    clus_sp_gen <- unique(clus_sp_gen)
    
    clus_sp_den <- clus_sp[which(clus_sp$source == "Popden"),c("ref","country","long","lat","cluster_center","distance.lowerthanDI")]
    clus_sp_den <- unique(clus_sp_den)

    x <- dplyr::left_join(gendata_sp,clus_sp_gen, by=c("country","long","lat"))
    if(any(is.na(x$cluster_center))) x <- x[!is.na(x$cluster_center),]
    xl[[j]] <- x
    
    y <- dplyr::left_join(clus_sp_den, dendata_sp)
    if(any(is.na(y$cluster_center))) y <- y[!is.na(y$cluster_center),]
    y$cluster_center <- droplevels(as.factor(y$cluster_center))
    yl[[j]] <- y

# # Calculate the DP average across studies in each cluster_center before merging with the GD
    DP_mean <- aggregate(dp ~ cluster_center, data = y, FUN=mean, na.action=na.omit)
    DP_max <- aggregate(dp ~ cluster_center, data = y, FUN=max, na.action=na.omit)
    DP_sd <- aggregate(dp ~ cluster_center, data = y, FUN=sd, na.action=na.omit)
    DP_n <- aggregate(dp ~ cluster_center, data = y, FUN=n_distinct, na.action=na.omit)
    DP_median <- aggregate(dp ~ cluster_center, data = y, FUN=median, na.action=na.omit)
    
    DP_mean <- data.frame(cluster_center=DP_max[,1], DP_max=DP_max[,2], DP_mean=DP_mean[,2], DP_sd=DP_sd[,2], DP_n=DP_n[,2], DP_median=DP_median[,2])
    
    DP_ref <- tapply(y$ref, y$cluster_center, unique)
    DP_ref <- lapply(DP_ref, paste, collapse=", ")
    DP_ref <- do.call(rbind,DP_ref)
    DP_ref <- data.frame(ref=DP_ref, cluster_center=rownames(DP_ref))
    
    DP_mean <- merge(DP_mean, DP_ref, all.x=T)
    
    DP_pop <- tapply(y$population, y$cluster_center, unique)
    DP_pop <- lapply(DP_pop, paste, collapse=", ")
    DP_pop <- do.call(rbind,DP_pop)
    DP_pop <- data.frame(population=DP_pop, cluster_center=rownames(DP_pop))
    
    
    DP_mean <- merge(DP_mean, DP_pop, all.x=T)
    
    gen_dp <- dplyr::left_join(x, DP_mean, by="cluster_center", suffix=c(".gen",".den"))

Visualise the clustering results

By species

Acinonyx jubatus

Felis chaus

Felis silvestris

Leopardus colocolo

Leopardus geoffroyi

Leopardus jacobita

Leopardus pardalis

Leopardus tigrinus

Lynx canadensis

Lynx lynx

Lynx pardinus

Lynx rufus

Panthera leo

Panthera onca

Panthera pardus

Panthera tigris

Panthera uncia

Prionailurus bengalensis

Puma concolor

Data summary

Table S1 : Data summarised for each species in cluster data set
Species Density records Density studies Genetic div. records Genetic div. studies Number of clusters with density and genetic records
Acinonyx jubatus 7 3 2 2 2
Felis silvestris 19 9 6 4 6
Leopardus colocolo 4 2 2 1 2
Leopardus jacobita 4 2 2 1 2
Leopardus pardalis 15 3 11 4 3
Lynx canadensis 1 1 1 1 1
Lynx lynx 43 7 7 2 5
Lynx pardinus 4 3 4 2 3
Lynx rufus 12 4 5 2 4
Panthera leo 106 12 51 11 23
Panthera onca 99 25 44 13 23
Panthera pardus 66 26 20 13 18
Panthera tigris 150 31 33 19 19
Panthera uncia 30 13 14 5 8
Puma concolor 29 12 17 11 13
Table S1 : Data summarised for each species in country data set
Species Density records Density studies Genetic div. records Genetic div. studies Number of countries with density and genetic records
Acinonyx jubatus 12 4 12 5 2
Felis chaus 1 1 1 1 1
Felis silvestris 19 8 13 5 4
Leopardus colocolo 5 3 6 1 2
Leopardus geoffroyi 10 4 4 2 3
Leopardus jacobita 4 2 4 1 2
Leopardus pardalis 18 8 12 4 3
Leopardus tigrinus 2 1 1 1 1
Lynx canadensis 1 1 47 6 1
Lynx lynx 30 8 3 1 3
Lynx pardinus 5 4 4 2 1
Lynx rufus 12 4 61 8 1
Panthera leo 61 15 75 14 12
Panthera onca 92 35 50 15 9
Panthera pardus 199 43 24 15 7
Panthera tigris 153 61 35 19 9
Panthera uncia 33 12 12 5 6
Puma concolor 26 14 120 21 5

There are 683 PD and 484 GD records from a total of 354 publications, across 15 and 18 in cluster and country dataset respectively.

Global map of cluster

(as in Figure 1)